################################################################################ 
#
# Counterspeech encouraging users to adopt the perspective of minority 
# groups reduces hate speech and its amplification on social media
#
# Data creation: Lasso controls
#
################################################################################ 


# Remove every non-hidden object from the current environment so the script
# starts from a clean workspace.
# NOTE(review): when this file is source()d into an interactive session this
# also deletes the caller's own objects -- confirm that is intended.
rm(list = ls())

################################################################################ 
#  LIBRARIES
################################################################################ 

library(dplyr)
library(tidyr)
library(readr)
library(fastDummies)
library(data.table)
library(lubridate)


################################################################################ 
#   DATA AND FOLDER
################################################################################ 

# Root folder of the project (left empty here; presumably filled in by the
# user before running) and the sub-folders derived from it.
wd <- ''
wd_res <- paste0(wd, '/results')
wd_data <- paste0(wd, '/data')


# Upload the data
data <- read.csv(paste0(wd_data, '/dataset_analysis.csv'))

# Drop users flagged as deleted. `%in%` is used instead of `==` because a
# missing `user_deleted` makes `==` return NA, and an NA in a logical row
# index produces a row filled entirely with NA instead of keeping the
# observation. `%in%` never returns NA, so such rows are kept unchanged.
data <- data[!(data$user_deleted %in% "yes"), ]


# Day of the year (1-366) on which the original tweet was created.
data$orig_date <- yday(as.Date(data$orig_tweet_created_at))

################################################################################ 
#   VARIABLES
################################################################################



# Controls in the PAP
# Names of the control variables used throughout the rest of the script.
controls <- c(
  "account_age",
  "tox_num_pre",
  "followers_count",
  "pre_treat_tweet_length",
  "total_tweets_pre",
  "friends_count",
  "orig_date"
)


# For each control, split its range into five intervals with cut() and keep
# the interval index (1-5) as a factor; the result has one column per control.
cont_dummies <- as.data.frame(
  lapply(data[controls], function(values) {
    bins <- cut(values, breaks = 5, labels = FALSE, include.lowest = TRUE)
    as.factor(bins)
  })
)

# 
# data$account_age_cuts = as.factor(cut(data$account_age, breaks=5))
# data$tox_num_pre_cuts = as.factor(cut(data$tox_num_pre, breaks=5))
# data$followers_count_cuts = as.factor(cut(data$followers_count, breaks=5))
# data$pre_treat_tweet_length_cuts = as.factor(cut(data$pre_treat_tweet_length, breaks=5))
# data$total_tweets_pre_cuts = as.factor(cut(data$total_tweets_pre, breaks=5))
# data$friends_count_cuts = as.factor(cut(data$friends_count, breaks=5))
# 
# controls_cuts = paste0(controls, "_cuts")


# One-hot encode the binned controls. dummy_cols() returns the input columns
# followed by the generated dummy columns, so the leading ncol(cont_dummies)
# columns are the original factor columns and are removed here.
# The number of columns to drop is derived from the data rather than being
# hard-coded: a fixed `-(1:6)` leaves the seventh binned control (orig_date)
# in the result as a stray bin-index column. `drop = FALSE` guarantees a data
# frame even if a single column remains.
cont_dummiespr <- dummy_cols(cont_dummies)[
  , -seq_len(ncol(cont_dummies)),
  drop = FALSE
]


# rep$pre_HS_dummy = ifelse(rep$num_hate_tweets_pre>0, 1, 0)
# 
# rep$pre_hate_share_tot = (rep$num_hate_tweets_pre) / (rep$num_tweets_pre)
# rep$pre_hate_share_tot[rep$num_tweets_pre==0] = 0
# 
# data = rep[(is.na(rep$deleted_suspended)), ]

################################################################################ 
#   Select controls for quintile transformation
################################################################################ 

# dum = c("display_text_width", "favorite_count", "retweet_count", 
#         "followers_count", "friends_count" , "statuses_count", "favourites_count",
#         "lang_share")
# 
# cont_dummies = lapply(data[dum], function(x) 
#   as.factor(cut(x, quantile(x, na.rm=T), labels = F, include.lowest = T))) %>% as.data.frame()
# 
# rm(dum)
# 
# data['year_agg'] = cut(as.numeric(data$year), c(2021, 2019, 2015, 2005) , labels = F, include.lowest = T)


# ################################################################################ 
# #   Create dummies
# ################################################################################ 
# 
# source_dummies = dummy_cols(data['source_main'])[,-1]
# language_dummies = dummy_cols(data['lang'])[,-1]
# year_dummies = dummy_cols(data['year_agg'])[,-1]
# day_dummies = dummy_cols(data['day'])[,-1]  # day the tweet is created
# dum_controls = c("location_yes")  
# cont_dummiespr = dummy_cols(cont_dummies)[,-(1:8)]
# 
# 
# ################################################################################ 
# #   Asinh transformation for some count outcomes
# ################################################################################ 
# 
# controls = c('pre_hate_share_tot', 'num_tweets_pre', # 'pre_hate_share_target', 
#              'num_hate_tweets_pre', 'num_keywords_pre', "tox_vader_neu_pre", #'tox_bert_pre', 
#              "display_text_width", "favorite_count", "retweet_count", 
#              "followers_count", "friends_count" , "listed_count",
#              "statuses_count", "favourites_count", "lang_share")

# data[controls] = lapply(data[controls], as.numeric)
# data[controls] = lapply(data[controls], asinh)


################################################################################ 
#   Final Dataset
################################################################################ 

# Apply the inverse hyperbolic sine to every control, in place. The binned
# dummies in cont_dummiespr were computed earlier in the script, i.e. from the
# values as they were before this transformation.
for (ctrl in controls) {
  data[[ctrl]] <- asinh(data[[ctrl]])
}


# only variables we want interacted: transformed controls plus bin dummies
X <- cbind.data.frame(data[controls], cont_dummiespr)

# ,
#                      data[dum_controls],
#                      language_dummies,
#                      year_dummies,
#                      source_dummies)

# Coerce every column to numeric before building the interactions.
X <- as.data.frame(lapply(X, as.numeric))

# find first interactions
# Build every pairwise product of the columns of X (a column times itself
# included) and store them next to user_id. A product is named by joining the
# two column names, sorted in decreasing order, with "_". Only the first
# occurrence of each name is kept, so (a, b) and (b, a) give a single column.
C <- data["user_id"]

vars <- names(X)

# Enumerate the pairs in the same order as a nested loop would: `outer_var`
# is the slow index and `inner_var` the fast one.
outer_var <- rep(vars, each = length(vars))
inner_var <- rep(vars, times = length(vars))

pair_names <- vapply(
  seq_along(outer_var),
  function(k) {
    ordered <- sort(c(outer_var[k], inner_var[k]), decreasing = TRUE)
    paste0(ordered[1], "_", ordered[2])
  },
  character(1)
)

# Positions of the first occurrence of every distinct pair name.
first_seen <- which(!duplicated(pair_names))

for (k in first_seen) {
  C[[pair_names[k]]] <- X[[outer_var[k]]] * X[[inner_var[k]]]
}

# Names of the interaction columns that were created, in creation order.
done <- pair_names[first_seen]


# Add also non interacted variables
# Assemble the output: an id column (copy of the first column of C), the
# non-interacted regressors in X, and the contents of C (user_id followed by
# the interaction terms).
final <- cbind.data.frame(exp_id = C[[1]], X, C)

# final[names(language_dummies)][is.na(final[names(language_dummies)])] = 0
# final[names(year_dummies)][is.na(final[names(year_dummies)])] = 0
# final[names(source_dummies)][is.na(final[names(source_dummies)])] = 0

# Impute missing as averages. seq_along() is used instead of 1:ncol(final) so
# that a frame without columns is skipped rather than iterated as c(1, 0).
for (j in seq_along(final)) {
  is_missing <- is.na(final[[j]])
  if (any(is_missing)) {
    final[[j]][is_missing] <- mean(final[[j]], na.rm = TRUE)
  }
}


# exclude all constant variables
# The variance is computed column by column: apply() on a data frame first
# converts it to a matrix, which turns every column into character as soon as
# one column is non-numeric. Columns whose variance is NA are dropped too
# (which() discards NA), and drop = FALSE keeps the result a data frame even
# when a single column survives.
col_variance <- vapply(final, function(col) as.numeric(var(col)), numeric(1))
final <- final[, which(col_variance != 0), drop = FALSE]

# NOTE(review): this converts every remaining column -- including the
# continuous controls and their interactions -- into a factor. Kept as is
# because downstream code may rely on it; confirm this is intended.
final <- as.data.frame(lapply(final, factor))

# Write into the data folder through an explicit path instead of changing the
# working directory of the session with setwd().
write_rds(final, file.path(wd_data, 'data_lasso.RDS'))



